In [3]:
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point, Polygon
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from libpysal.weights import KNN
from esda.moran import Moran
from pointpats import PoissonPointProcess, k, Window
import hdbscan
import numpy as np
import pysal
import seaborn
pd.options.display.max_columns = None
import seaborn as sns
from astropy.stats import RipleysKEstimator
import geopandas as gpd
from shapely.geometry import Polygon, MultiPolygon
import contextily as ctx
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy.stats import entropy
import matplotlib.cm as cm
import matplotlib.colors as mcolors
import itertools
from sklearn.metrics import silhouette_score
from geopy.distance import geodesic
import warnings
warnings.filterwarnings('ignore')

Helper Functions¶

In [4]:
def near_station(centroid, station_df, station_buffer=500, geo_col='geometry', name_col='name'):
    """Check whether `centroid` lies within `station_buffer` metres of any station.

    Parameters
    ----------
    centroid : shapely Point in lon/lat (WGS84) — .x is longitude, .y latitude.
    station_df : DataFrame with one row per station (`name_col`, `geo_col` columns).
    station_buffer : distance threshold in metres.

    Returns
    -------
    (bool, float, dict): near-any-station flag, distance to the closest
    station in metres, and the full {station name: distance} mapping.
    Raises ValueError if `station_df` is empty (min of no distances).
    """
    origin = (centroid.y, centroid.x)  # geodesic expects (lat, lon)
    distances = {
        name: geodesic(origin, (point.y, point.x)).meters
        for name, point in zip(station_df[name_col], station_df[geo_col])
    }
    nearest = min(distances.values())
    # nearest <= buffer is equivalent to "any distance <= buffer".
    return nearest <= station_buffer, nearest, distances

def compute_entropy(counts):
    """Shannon entropy (natural log) of a count vector, via scipy.stats.entropy."""
    total = counts.sum()
    return entropy(counts / total)

def compute_gini(counts):
    """Gini coefficient of a count vector, in [0, 1].

    0 = perfectly even distribution; approaches 1 as the mass concentrates
    in one category. A single category returns 1 (kept from the original
    convention of treating one sector as maximal concentration).

    Bug fix: the discrete Lorenz-curve formula is
        G = 1 - (2 / n) * sum(cumulative shares) + 1 / n
    The previous code divided by (n - 1), which pushed results outside
    [0, 1] (e.g. two equal counts gave -1.5 instead of 0).
    """
    if len(counts) == 1:
        return 1
    probs = counts / counts.sum()
    sorted_probs = np.sort(probs)
    n = len(probs)
    cum = np.cumsum(sorted_probs)
    # 1 - 2 * (right-Riemann area under the Lorenz curve), with the 1/n
    # correction term for the discrete approximation.
    gini = 1 - (2 / n) * np.sum(cum) + (1 / n)
    return gini

def classify_row(row, final_gdf_clustered, ent_thresh=0.7, entrop_col = 'entropy_norm', mode = 'sector_number'):
    """Label one row's entropy as 'High diversity', 'Moderate diversity' or 'Homogeneous'.

    Thresholds are derived per `mode`:
      - 'sector_number': 40% / 20% of log2(number of non-'Other' sectors in
        the row's year) — the passed `ent_thresh` is ignored here.
      - 'entropy_quantile': the `ent_thresh` and (1 - ent_thresh) quantiles
        of `entrop_col` within the row's year.
      - 'entropy_median': the within-year median for both bounds.
      - anything else: fixed bounds `ent_thresh` / (1 - ent_thresh).
    """
    same_year = final_gdf_clustered[final_gdf_clustered['year'] == row['year']]
    if mode == 'sector_number':
        n_sectors = same_year[same_year['scat_category'] != 'Other']['scat_category'].nunique()
        upper = 0.4 * np.log2(n_sectors)
        lower = 0.2 * np.log2(n_sectors)
    elif mode == 'entropy_quantile':
        upper = same_year[entrop_col].quantile(ent_thresh)
        lower = same_year[entrop_col].quantile(1 - ent_thresh)
    elif mode == 'entropy_median':
        upper = same_year[entrop_col].median()
        lower = upper
    else:
        upper = ent_thresh
        lower = 1 - ent_thresh

    value = row[entrop_col]
    if value >= upper:
        return 'High diversity'
    if value < lower:
        return 'Homogeneous'
    return 'Moderate diversity'

def groupby_multipoly(df, by, aggfunc="first"):
    """Group a polygon GeoDataFrame by `by`, merging each group's polygons
    into one MultiPolygon and aggregating the non-spatial columns with `aggfunc`.

    Bug fix: the aggregated attribute columns were computed and then
    discarded — the function returned only the merged geometries. The
    attributes are now joined back onto the result, so callers get the
    grouped data alongside the geometry (previous columns are preserved).

    NOTE(review): MultiPolygon(...) assumes every group member is a plain
    Polygon — confirm inputs are not already MultiPolygons.
    """
    data = df.drop(labels=df.geometry.name, axis=1)
    aggregated_data = data.groupby(by=by).agg(aggfunc)

    # Process spatial component: one MultiPolygon per group.
    def merge_geometries(block):
        return MultiPolygon(block.values)

    g = df.groupby(by=by, group_keys=False)[df.geometry.name].agg(
        merge_geometries
    )
    # Both groupbys share the same index (the group keys), so a join aligns them.
    aggregated = aggregated_data.join(g)
    return gpd.GeoDataFrame(aggregated, geometry=df.geometry.name, crs=df.crs).reset_index()
    
def load_data(path, year_col='year', lat_col='latitude', lon_col='longitude'):
    """Load a pickled DataFrame and return it as a GeoDataFrame in EPSG:27700.

    Points are built as (lon, lat) in WGS84 and reprojected to British
    National Grid. `year_col` is accepted for interface compatibility but
    unused here. NOTE(review): pd.read_pickle executes arbitrary code —
    only load trusted files.
    """
    df = pd.read_pickle(path)
    df['geometry'] = [Point(lon, lat) for lon, lat in zip(df[lon_col], df[lat_col])]
    gdf = gpd.GeoDataFrame(df, geometry='geometry', crs='EPSG:4326')
    return gdf.to_crs('EPSG:27700')

def compute_morans_i(gdf, attribute, k = 8):
    """Global Moran's I for `attribute` using row-standardised KNN weights.

    A falsy `k` falls back to 8; `k` is then capped at len(gdf) - 1 so the
    KNN weights stay valid for small frames. Returns (I, permutation p-value).
    """
    neighbours = k or 8
    neighbours = min(neighbours, len(gdf) - 1)
    w = KNN.from_dataframe(gdf, k=neighbours)
    w.transform = 'R'  # row-standardise the weights
    moran = Moran(gdf[attribute], w)
    return moran.I, moran.p_sim

def compute_ripleys_k(gdf, max_dist=100, mode='none', draw_plot = False):
    """Ripley's K for the point geometries of `gdf` against a Poisson (CSR) baseline.

    The study window is the points' bounding box; `max_dist` is the number
    of radii sampled between 0 and sqrt(window area). Returns a long-format
    DataFrame with a 'Poisson' and a 'Reply_K' curve, optionally plotted.
    """
    xmin, ymin, xmax, ymax = gdf.total_bounds
    area = (xmax - xmin) * (ymax - ymin)
    Kest = RipleysKEstimator(area=area, x_max=xmax, y_max=ymax, x_min=xmin, y_min=ymin)
    coords = np.array([(geom.x, geom.y) for geom in gdf.geometry])
    r = np.linspace(0, int(area ** 0.5), max_dist)
    observed = Kest(data=coords, radii=r, mode=mode)
    expected = Kest.poisson(r)

    reply_df = pd.concat(
        [pd.DataFrame({'radius' : r, 'reply' : expected, 'type' : ['Poisson'] * len(expected)}),
         pd.DataFrame({'radius' : r, 'reply' : observed, 'type' : ['Reply_K'] * len(observed)})],
        ignore_index = True)
    if draw_plot:
        sns.lineplot(reply_df, x = 'radius', y = 'reply', hue = 'type')
    return reply_df

def apply_dbscan(gdf, eps=300, min_samples=5):
    """Cluster point geometries with DBSCAN; labels written to gdf['cluster'] (-1 = noise).

    Bug fix: coordinates were standardised with StandardScaler before
    clustering, so `eps` (intended as a distance in the projected CRS
    units — metres for EPSG:27700, consistent with the metre-based buffers
    and epsilons used elsewhere in this notebook) was compared against
    unit-variance coordinates; eps=300 then swallowed every point into a
    single cluster. DBSCAN now runs on the raw projected coordinates so
    `eps` really is a distance in metres.
    """
    coords = [(p.x, p.y) for p in gdf.geometry]
    db = DBSCAN(eps=eps, min_samples=min_samples).fit(coords)
    # NOTE(review): mutates the caller's frame in place, as before.
    gdf['cluster'] = db.labels_
    return gdf

def apply_hdbscan(gdf, metric='euclidean'):
    """Grid-search HDBSCAN over min_cluster_size x cluster_selection_epsilon,
    keep the best silhouette score, and write the winning labels to gdf['cluster'].

    Returns (gdf, results_df) where results_df has one row per parameter
    combination that produced at least one non-noise point. Raises
    ValueError if no combination ever beats the initial score (e.g. every
    combination labelled all points as noise).

    NOTE(review): coordinates are passed as raw (x, y) tuples; with
    metric='haversine' HDBSCAN expects (lat, lon) in radians — confirm
    callers convert accordingly.
    """
    min_cluster_sizes = [15, 25, 35, 50, 60, 80, 100]
    epsilons = [0, 50, 100, 150, 200]

    best_score = -1
    best_params = None
    results = []
    coords = gdf.geometry.apply(lambda p: (p.x, p.y)).tolist()
    for mcs, eps in itertools.product(min_cluster_sizes, epsilons):
        clusterer = hdbscan.HDBSCAN(min_cluster_size=mcs, metric=metric, cluster_selection_epsilon=eps)
        labels = clusterer.fit_predict(coords)
        if np.all(labels == -1):
            # Everything is noise: nothing to score for this combination.
            continue
        if len(set(labels)) > 1:
            sil_score = silhouette_score(coords, labels)
        else:
            # Single cluster: silhouette undefined, record a sentinel.
            sil_score = -1

        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        noise_pct = np.sum(labels == -1) / len(labels) * 100

        results.append({
            'min_cluster_size': mcs,
            'epsilon_m': eps,
            'silhouette': sil_score,
            'n_clusters': n_clusters,
            'noise_pct': noise_pct
        })
        if sil_score > best_score:
            best_score = sil_score
            best_params = (mcs, eps)

    # Bug fix: previously a None best_params (no combination ever beat the
    # initial -1 score) crashed with an opaque TypeError on the refit below.
    if best_params is None:
        raise ValueError("HDBSCAN found no usable clustering for any parameter combination")
    print(f"After trying the ranges for min and epis, the best values are {best_params} with score is {best_score}")
    clusterer = hdbscan.HDBSCAN(min_cluster_size=best_params[0], metric=metric, cluster_selection_epsilon=best_params[1])
    gdf['cluster'] = clusterer.fit_predict(coords)
    return gdf, pd.DataFrame(results)

def gini_index(series):
    """Gini-Simpson (Blau) diversity of a categorical series: 1 - sum(p_i^2).

    0 = a single category; approaches 1 as categories become many and even.
    (Despite the name, this is the Gini-Simpson index, not the Gini
    coefficient — kept for interface compatibility.)
    """
    shares = series.value_counts(normalize=True)
    return 1 - (shares * shares).sum()

def analyze_over_time(gdf_all, time_col='year', attribute='employee_count', industry = None, industry_col = 'scat_category',
                     min_cluster_size = None, min_cluster_percent = 0.02, min_cluster_size_limit = 50, k_morans_i = 8,
                     reply_draw_plot = False, cluster_draw_plot = False, metric='euclidean'):
    """Per-year spatial analysis pipeline: Moran's I, Ripley's K and HDBSCAN clustering.

    For each year in `gdf_all[time_col]` this computes Moran's I on OA-level
    business counts, Ripley's K on the raw points, and HDBSCAN cluster labels,
    then aggregates cluster/sector/accessibility summaries across all years.

    Returns a 13-tuple: (per-year results list, silhouette grid-search scores,
    the combined clustered GeoDataFrame, cluster transition counts, cluster
    trends, two pivot tables, sector distribution, dominant sectors, PTAL
    distribution, dominant PTAL, sector Gini per cluster, PTAL Gini per cluster).

    NOTE(review): `attribute`, `min_cluster_size`, `min_cluster_percent` and
    `min_cluster_size_limit` are accepted but never used in this body —
    confirm whether they were meant to feed the clustering step.
    """
    # Optionally restrict to a single sector; title is used for the per-year plots.
    if industry:
        gdf_all_tmp = gdf_all[gdf_all[industry_col] == industry].copy()
        title = f"Business Clusters - {industry}"
    else:
        gdf_all_tmp = gdf_all.copy()
        title = f"Business Clusters"
    years = sorted(gdf_all_tmp[time_col].unique())
    results = []
    cluster_sil_scores = []

    for year in years:
        gdf_year = gdf_all_tmp[gdf_all_tmp[time_col] == year].copy()
        print(gdf_year.shape)
        # Collapse businesses to one row per Output Area with a business count.
        gdf_year_oa = gdf_year.groupby(['oa21cd', 'oa_lat', 'oa_long']).count()[['CompanyID']].reset_index().rename(
            columns = {'CompanyID' : 'bus_count'})
        print(gdf_year_oa.shape)
        # Rebuild OA centroid geometry in WGS84, then project to metres for KNN weights.
        gdf_year_oa['geometry'] = gdf_year_oa.apply(lambda row: Point(row['oa_long'], row['oa_lat']), axis=1)
        gdf_year_oa = gpd.GeoDataFrame(gdf_year_oa, geometry='geometry', crs='EPSG:4326')
        gdf_year_oa = gdf_year_oa.to_crs('EPSG:27700')
        moran_i, p = compute_morans_i(gdf_year_oa, 'bus_count', k = k_morans_i)
        reply_df = compute_ripleys_k(gdf_year, draw_plot = reply_draw_plot)
        # apply_hdbscan grid-searches its own parameters; second return value
        # is the per-combination silhouette table, tagged here with the year.
        gdf_clustered, sil_score = apply_hdbscan(gdf_year, metric = metric)
        sil_score['year'] = year
        cluster_sil_scores.append(sil_score)

        # Per-year cluster sizes (noise cluster -1 included).
        cluster_trends = gdf_clustered.groupby(['year', 'cluster']).size().reset_index(name='count')
        cluster_trends_pivot = cluster_trends.pivot(index='year', columns='cluster', values='count').fillna(0)
        
        # Sector mix within each cluster and the single most common sector.
        sector_dist = gdf_clustered.groupby(['year', 'cluster', 'scat_category']).size().reset_index(name='count')
        sector_dist['proportion'] = sector_dist.groupby(['year', 'cluster'])['count'].transform(lambda x: x / x.sum())
        dominant_sectors = sector_dist.sort_values(['year', 'cluster', 'proportion'], ascending=[True, True, False]) \
                               .groupby(['year', 'cluster']).first().reset_index()

        print(f"\nYear: {year}")
        print(f"Moran's I: {moran_i:.4f}, p-value: {p:.4f}")
        print(f"Clusters found: {gdf_clustered['cluster'].nunique() - (1 if -1 in gdf_clustered['cluster'].unique() else 0)}")

        results.append({
            'year': year,
            'moran_i': moran_i,
            'moran_p': p,
            'num_clusters': gdf_clustered['cluster'].nunique() - (1 if -1 in gdf_clustered['cluster'].unique() else 0),
            'clustered_gdf': gdf_clustered,
            'reply_df': reply_df,
            'cluster_trends' : cluster_trends,
            'cluster_trends_pivot' : cluster_trends_pivot,
            'sector_dist' : sector_dist,
            'dominant_sectors' : dominant_sectors
        })

        if cluster_draw_plot:
            fig, ax = plt.subplots(figsize=(6,6))
            gdf_clustered.plot(column='cluster', cmap='tab20', legend=True, ax=ax)
            plt.title(title + f" - {year}")
            plt.show()

    cluster_sil_scores = pd.concat(cluster_sil_scores, ignore_index = True)
    # Stack all years and track each company's cluster membership over time.
    # NOTE(review): cluster labels are re-fit independently per year, so a
    # "cluster_change" compares labels that are not guaranteed to correspond
    # across years — interpret transitions with care.
    final_gdf_clustered = pd.concat([c['clustered_gdf'] for c in results], ignore_index = True)
    final_gdf_clustered = final_gdf_clustered.sort_values(['CompanyID', 'year'])
    final_gdf_clustered['prev_cluster'] = final_gdf_clustered.groupby('CompanyID')['cluster'].shift(1)
    final_gdf_clustered['cluster_change'] = final_gdf_clustered['cluster'] != final_gdf_clustered['prev_cluster']
    transitions = final_gdf_clustered.dropna(subset=['prev_cluster']) \
                           .groupby(['prev_cluster', 'cluster']).size().reset_index(name='count')

    # Cross-year cluster size and share trends.
    cluster_trends = final_gdf_clustered.groupby(['year', 'cluster']).size().reset_index(name='count')
    cluster_trends['proportion'] = cluster_trends.groupby(['year'])['count'].transform(lambda x: x / x.sum())
    cluster_trends_pivot = cluster_trends.pivot(index='year', columns='cluster', values='count').fillna(0)
    cluster_proportion_trends_pivot = cluster_trends.pivot(index='year', columns='cluster', values='proportion').fillna(0)
    
    # Sector composition per cluster-year and the dominant sector.
    sector_dist = final_gdf_clustered.groupby(['year', 'cluster', 'scat_category']).size().reset_index(name='count')
    sector_dist['proportion'] = sector_dist.groupby(['year', 'cluster'])['count'].transform(lambda x: x / x.sum())
    dominant_sectors = sector_dist.sort_values(['year', 'cluster', 'proportion'], ascending=[True, True, False]) \
                           .groupby(['year', 'cluster']).first().reset_index()
    
    # Same breakdown for public-transport accessibility level (PTAL 2023).
    access_dist = final_gdf_clustered.groupby(['year', 'cluster', 'PTAL2023_main']).size().reset_index(name='count')
    access_dist['proportion'] = access_dist.groupby(['year', 'cluster'])['count'].transform(lambda x: x / x.sum())
    dominant_access = access_dist.sort_values(['year', 'cluster', 'proportion'], ascending=[True, True, False]) \
                           .groupby(['year', 'cluster']).first().reset_index()

    # Diversity (Gini-Simpson via gini_index) of sectors and PTAL per cluster-year.
    gini_by_cluster = final_gdf_clustered.groupby(['year', 'cluster'])['scat_category'].apply(gini_index).reset_index()
    gini_by_cluster.columns = ['year', 'cluster', 'gini_index']

    access_gini_by_cluster = final_gdf_clustered.groupby(['year', 'cluster'])['PTAL2023_main'].apply(gini_index).reset_index()
    access_gini_by_cluster.columns = ['year', 'cluster', 'gini_index']
    
    return results, cluster_sil_scores, final_gdf_clustered, transitions, cluster_trends, cluster_trends_pivot, cluster_proportion_trends_pivot, \
            sector_dist, dominant_sectors, access_dist, dominant_access, gini_by_cluster, access_gini_by_cluster

Load Business Dataset, VNEB, OKR and OAs Boundaries¶

In [8]:
# Output Area (OA) 2021 boundaries, restricted to Wandsworth and Lambeth LSOAs.
# NOTE(review): Windows-style hard-coded relative paths — consider pathlib.
oas = "Data\\OA_2021_EW_BFE_V9.shp"
oas_gdf = gpd.read_file(oas)
oas_gdf = oas_gdf[oas_gdf['LSOA21NM'].apply(lambda x: ('wandsworth' in x.lower()) or ('lambeth' in x.lower()))]
# Geometry is in the shapefile's native CRS; areas are in its square units
# (presumably metres for a BNG shapefile — confirm the layer CRS).
oas_gdf["area_m2"] = oas_gdf.geometry.area

##############
# PTAL (Public Transport Accessibility Level) 100m grid, 2023.
ptal_path = "Data\\PTAL_2023_Grid_100m_100m.shp"
ptal_df = gpd.read_file(ptal_path)

##############
# Opportunity Areas; keep only the joint Lambeth/Wandsworth one and align CRS.
oa_area = 'Data\\Opportunity_Areas.shp'
oa_area_gdf = gpd.read_file(oa_area)
wand_oa_area_gdf = oa_area_gdf[oa_area_gdf['borough'] == 'Lambeth,  Wandsworth'].copy()
wand_oa_area_gdf = wand_oa_area_gdf.to_crs(oas_gdf.crs)

################
# Keep OAs intersecting the Opportunity Area; lower-case columns and expose
# OA centroid lon/lat as oa_long / oa_lat.
print(oas_gdf.shape)
oas_area_wandsworth = oas_gdf.sjoin(wand_oa_area_gdf, predicate = 'intersects'
                                   ).drop(columns = ['index_right'])
oas_area_wandsworth.columns = [c.lower() for c in oas_area_wandsworth.columns]
oas_area_wandsworth = oas_area_wandsworth.rename(columns = {'long' : 'oa_long', 'lat' : 'oa_lat'})
oas_area_wandsworth = gpd.GeoDataFrame(oas_area_wandsworth,
                                   geometry=oas_area_wandsworth['geometry'], crs="EPSG:27700")
print(oas_area_wandsworth.shape)
(2045, 11)
(111, 38)
In [12]:
# Load the business/census point data, then duplicate the 2012 census-derived
# records ('Census' and 'Both' sources) into 2011 and 2010 — presumably to
# back-fill years without their own census snapshot (TODO confirm).
gdf_all = load_data("Data/final_br_census_oa_df.pkl", year_col='year', lat_col='lat', lon_col='long')
categories = ['Retail', 'Leisure', 'Office', 'Industrial']

backfills = []
# Order matches the original tmp1..tmp4 concat order exactly.
for source, target_year in [('Census', 2011), ('Both', 2011), ('Census', 2010), ('Both', 2010)]:
    snapshot = gdf_all[(gdf_all['source'] == source) & (gdf_all['year'] == 2012)].copy()
    snapshot['year'] = target_year
    backfills.append(snapshot)

gdf_all = pd.concat([gdf_all] + backfills)
# Collapse everything outside the four headline sectors into 'Other'.
gdf_all['scat_category'] = gdf_all['scat_category'].apply(lambda x: x if x in categories else 'Other')
In [13]:
final_pop_oa_df = pd.read_pickle("Data/final_pop_oa_df.pkl")
oa_final_df = pd.read_pickle("Data/VNEB_oa_final_df.pkl")
# Businesses per km^2: area_m2 is in m^2, so * 1e6 converts to km^2; rounded up.
oa_final_df['bus_density'] = (oa_final_df['bus_count'] / oa_final_df['area_m2'] * 1000000).apply(np.ceil)

Create Stations Dataframe¶

In [14]:
# Northern Line Extension station locations as (lat, lon) tuples.
battersea_station = (51.479774, -0.1418745)  # lat, lon
nine_elms = (51.4799781, -0.1285638)

# Shapely Points take (x, y) = (lon, lat), hence the index swap below.
stations_df = gpd.GeoDataFrame({
    'name': ['Battersea Power Station', 'Nine Elms'],
    'geometry': [Point(battersea_station[1], battersea_station[0]),  # lon, lat
                 Point(nine_elms[1], nine_elms[0])]
}, crs='EPSG:4326')
# Web-Mercator copy, reused later as the base for metre-buffered copies.
stations_df_tmp = stations_df.to_crs(epsg=3857)
stations_df
Out[14]:
name geometry
0 Battersea Power Station POINT (-0.14187 51.47977)
1 Nine Elms POINT (-0.12856 51.47998)

Plot Overall and Industry-Specific Business Density¶

In [6]:
# Choropleth of OA-level business counts for selected years, all industries.
# NOTE(review): this cell is copy-pasted four more times below with only
# `industry` changed — a plot_business_counts(industry) helper would remove
# the duplication.
years = [2012, 2015, 2018, 2021, 2024]
industry = 'All'

# Pooled across all selected years, only to fix a shared colour scale.
combined_df = oa_final_df[(oa_final_df['year'].isin(years)) &
                          (oa_final_df['industry'] == industry)].copy()

oas_combined = oas_area_wandsworth.set_index('oa21cd').join(
    combined_df.set_index('oa21cd')[['bus_count']]
).reset_index()
oas_combined = oas_combined.to_crs(epsg=3857)

vmin = oas_combined['bus_count'].min()
vmax = oas_combined['bus_count'].max()
norm = mcolors.Normalize(vmin=vmin, vmax=vmax)
cmap = cm.viridis

fig, axes = plt.subplots(2, 3, figsize=(38, 30), sharex=True, sharey=True)
axes = axes.flatten()

for i, year in enumerate(years):
    ax = axes[i]

    oa_final_df_year = oa_final_df[(oa_final_df['year'] == year) &
                                   (oa_final_df['industry'] == industry)].copy()

    # Join the year's counts onto the OA polygons; Web Mercator for the basemap.
    oas_year = oas_area_wandsworth.set_index('oa21cd').join(
        oa_final_df_year.set_index('oa21cd')[['bus_density', 'bus_count']]
    ).reset_index()
    oas_year = oas_year.to_crs(epsg=3857)

    oas_year.plot(
        ax=ax,
        column='bus_count',
        cmap=cmap,
        norm=norm,
        edgecolor='white',
        linewidth=0.5,
        legend=False
    )

    ctx.add_basemap(ax, source=ctx.providers.OpenStreetMap.Mapnik, crs=oas_year.crs)
    ax.set_title(f'Business Count in {year}', fontsize=24, weight='bold')
    ax.set_axis_off()

# Sixth panel is unused (5 years on a 2x3 grid); shared horizontal colourbar.
axes[-1].axis('off')
sm = cm.ScalarMappable(cmap=cmap, norm=norm)
sm._A = []
cbar = fig.colorbar(sm, ax=axes[:5], orientation='horizontal', fraction=0.02, pad=0)
cbar.set_label('Business Count', fontsize=16)

plt.suptitle("Wandsworth OA Business Count by Year", fontsize=30, weight='bold', y=1)
plt.tight_layout(rect=[0, 0.1, 1, 1])
plt.show()
No description has been provided for this image
In [7]:
# Same choropleth as the 'All' cell above, restricted to Office businesses.
# NOTE(review): duplicated cell — candidate for a shared plotting function.
years = [2012, 2015, 2018, 2021, 2024]
industry = 'Office'

combined_df = oa_final_df[(oa_final_df['year'].isin(years)) &
                          (oa_final_df['industry'] == industry)].copy()

oas_combined = oas_area_wandsworth.set_index('oa21cd').join(
    combined_df.set_index('oa21cd')[['bus_count']]
).reset_index()
oas_combined = oas_combined.to_crs(epsg=3857)

vmin = oas_combined['bus_count'].min()
vmax = oas_combined['bus_count'].max()
norm = mcolors.Normalize(vmin=vmin, vmax=vmax)
cmap = cm.viridis

fig, axes = plt.subplots(2, 3, figsize=(38, 30), sharex=True, sharey=True)
axes = axes.flatten()

for i, year in enumerate(years):
    ax = axes[i]

    # Filter data for the specific year and industry
    oa_final_df_year = oa_final_df[(oa_final_df['year'] == year) &
                                   (oa_final_df['industry'] == industry)].copy()

    oas_year = oas_area_wandsworth.set_index('oa21cd').join(
        oa_final_df_year.set_index('oa21cd')[['bus_density', 'bus_count']]
    ).reset_index()
    oas_year = oas_year.to_crs(epsg=3857)

    oas_year.plot(
        ax=ax,
        column='bus_count',
        cmap=cmap,
        norm=norm,
        edgecolor='white',
        linewidth=0.5,
        legend=False
    )

    ctx.add_basemap(ax, source=ctx.providers.OpenStreetMap.Mapnik, crs=oas_year.crs)
    ax.set_title(f'Business Count in {year}', fontsize=24, weight='bold')
    ax.set_axis_off()

axes[-1].axis('off')

sm = cm.ScalarMappable(cmap=cmap, norm=norm)
sm._A = []
cbar = fig.colorbar(sm, ax=axes[:5], orientation='horizontal', fraction=0.02, pad=0)
cbar.set_label('Business Count', fontsize=16)

plt.suptitle("Wandsworth OA Business Count by Year", fontsize=30, weight='bold', y=1)
plt.tight_layout(rect=[0, 0.1, 1, 1])
plt.show()
No description has been provided for this image
In [8]:
# Same choropleth as the 'All' cell above, restricted to Retail businesses.
# NOTE(review): duplicated cell — candidate for a shared plotting function.
years = [2012, 2015, 2018, 2021, 2024]
industry = 'Retail'

combined_df = oa_final_df[(oa_final_df['year'].isin(years)) &
                          (oa_final_df['industry'] == industry)].copy()

oas_combined = oas_area_wandsworth.set_index('oa21cd').join(
    combined_df.set_index('oa21cd')[['bus_count']]
).reset_index()
oas_combined = oas_combined.to_crs(epsg=3857)

vmin = oas_combined['bus_count'].min()
vmax = oas_combined['bus_count'].max()
norm = mcolors.Normalize(vmin=vmin, vmax=vmax)
cmap = cm.viridis

fig, axes = plt.subplots(2, 3, figsize=(38, 30), sharex=True, sharey=True)
axes = axes.flatten()

for i, year in enumerate(years):
    ax = axes[i]

    oa_final_df_year = oa_final_df[(oa_final_df['year'] == year) &
                                   (oa_final_df['industry'] == industry)].copy()

    oas_year = oas_area_wandsworth.set_index('oa21cd').join(
        oa_final_df_year.set_index('oa21cd')[['bus_density', 'bus_count']]
    ).reset_index()
    oas_year = oas_year.to_crs(epsg=3857)

    oas_year.plot(
        ax=ax,
        column='bus_count',
        cmap=cmap,
        norm=norm,
        edgecolor='white',
        linewidth=0.5,
        legend=False
    )

    ctx.add_basemap(ax, source=ctx.providers.OpenStreetMap.Mapnik, crs=oas_year.crs)
    ax.set_title(f'Business Count in {year}', fontsize=24, weight='bold')
    ax.set_axis_off()

axes[-1].axis('off')

sm = cm.ScalarMappable(cmap=cmap, norm=norm)
sm._A = []
cbar = fig.colorbar(sm, ax=axes[:5], orientation='horizontal', fraction=0.02, pad=0)
cbar.set_label('Business Count', fontsize=16)

plt.suptitle("Wandsworth OA Business Count by Year", fontsize=30, weight='bold', y=1)
plt.tight_layout(rect=[0, 0.1, 1, 1])
plt.show()
No description has been provided for this image
In [9]:
# Same choropleth as the 'All' cell above, restricted to Leisure businesses.
# NOTE(review): duplicated cell — candidate for a shared plotting function.
years = [2012, 2015, 2018, 2021, 2024]
industry = 'Leisure'

combined_df = oa_final_df[(oa_final_df['year'].isin(years)) &
                          (oa_final_df['industry'] == industry)].copy()

oas_combined = oas_area_wandsworth.set_index('oa21cd').join(
    combined_df.set_index('oa21cd')[['bus_count']]
).reset_index()
oas_combined = oas_combined.to_crs(epsg=3857)

vmin = oas_combined['bus_count'].min()
vmax = oas_combined['bus_count'].max()
norm = mcolors.Normalize(vmin=vmin, vmax=vmax)
cmap = cm.viridis

fig, axes = plt.subplots(2, 3, figsize=(38, 30), sharex=True, sharey=True)
axes = axes.flatten()

for i, year in enumerate(years):
    ax = axes[i]

    oa_final_df_year = oa_final_df[(oa_final_df['year'] == year) &
                                   (oa_final_df['industry'] == industry)].copy()

    oas_year = oas_area_wandsworth.set_index('oa21cd').join(
        oa_final_df_year.set_index('oa21cd')[['bus_density', 'bus_count']]
    ).reset_index()
    oas_year = oas_year.to_crs(epsg=3857)

    oas_year.plot(
        ax=ax,
        column='bus_count',
        cmap=cmap,
        norm=norm,
        edgecolor='white',
        linewidth=0.5,
        legend=False
    )

    ctx.add_basemap(ax, source=ctx.providers.OpenStreetMap.Mapnik, crs=oas_year.crs)
    ax.set_title(f'Business Count in {year}', fontsize=24, weight='bold')
    ax.set_axis_off()

axes[-1].axis('off')

sm = cm.ScalarMappable(cmap=cmap, norm=norm)
sm._A = []
cbar = fig.colorbar(sm, ax=axes[:5], orientation='horizontal', fraction=0.02, pad=0)
cbar.set_label('Business Count', fontsize=16)

plt.suptitle("Wandsworth OA Business Count by Year", fontsize=30, weight='bold', y=1)
plt.tight_layout(rect=[0, 0.1, 1, 1])
plt.show()
No description has been provided for this image
In [10]:
# Same choropleth as the 'All' cell above, restricted to Industrial businesses.
# NOTE(review): duplicated cell — candidate for a shared plotting function.
years = [2012, 2015, 2018, 2021, 2024]
industry = 'Industrial'

combined_df = oa_final_df[(oa_final_df['year'].isin(years)) &
                          (oa_final_df['industry'] == industry)].copy()

oas_combined = oas_area_wandsworth.set_index('oa21cd').join(
    combined_df.set_index('oa21cd')[['bus_count']]
).reset_index()
oas_combined = oas_combined.to_crs(epsg=3857)

vmin = oas_combined['bus_count'].min()
vmax = oas_combined['bus_count'].max()
norm = mcolors.Normalize(vmin=vmin, vmax=vmax)
cmap = cm.viridis

fig, axes = plt.subplots(2, 3, figsize=(38, 30), sharex=True, sharey=True)
axes = axes.flatten()

for i, year in enumerate(years):
    ax = axes[i]

    oa_final_df_year = oa_final_df[(oa_final_df['year'] == year) &
                                   (oa_final_df['industry'] == industry)].copy()

    oas_year = oas_area_wandsworth.set_index('oa21cd').join(
        oa_final_df_year.set_index('oa21cd')[['bus_density', 'bus_count']]
    ).reset_index()
    oas_year = oas_year.to_crs(epsg=3857)

    oas_year.plot(
        ax=ax,
        column='bus_count',
        cmap=cmap,
        norm=norm,
        edgecolor='white',
        linewidth=0.5,
        legend=False
    )

    ctx.add_basemap(ax, source=ctx.providers.OpenStreetMap.Mapnik, crs=oas_year.crs)
    ax.set_title(f'Business Count in {year}', fontsize=24, weight='bold')
    ax.set_axis_off()

axes[-1].axis('off')

sm = cm.ScalarMappable(cmap=cmap, norm=norm)
sm._A = []
cbar = fig.colorbar(sm, ax=axes[:5], orientation='horizontal', fraction=0.02, pad=0)
cbar.set_label('Business Count', fontsize=16)

plt.suptitle("Wandsworth OA Business Count by Year", fontsize=30, weight='bold', y=1)
plt.tight_layout(rect=[0, 0.1, 1, 1])
plt.show()
No description has been provided for this image
In [11]:
# NOTE(review): this groupby result is discarded — it is not the last
# expression in the cell, so it never displays. Move it to its own cell
# or wrap in display() if the summary is wanted.
final_pop_oa_df[['year', 'total_pop', 'pop_19_64']].groupby(['year']).sum()

# NOTE(review): duplicate of the stations_df built in the earlier cell
# (In [14]); relies on battersea_station / nine_elms from that cell.
stations_df = gpd.GeoDataFrame({
    'name': ['Battersea Power Station', 'Nine Elms'],
    'geometry': [Point(battersea_station[1], battersea_station[0]),  # lon, lat
                 Point(nine_elms[1], nine_elms[0])]
}, crs='EPSG:4326')
In [12]:
# OA-level job counts; keep only the all-industries rows.
filtered_jobs_df = pd.read_pickle("filtered_jobs_df.pkl")
filtered_jobs_df = filtered_jobs_df[filtered_jobs_df['industry'] == 'All']
In [13]:
# Jobs vs population for OAs whose centroid falls within 750 m of a station.
buffer = 750
buffer_stations_df = stations_df_tmp.copy()
# Reproject to the business data's CRS (metric) so the buffer is in metres.
buffer_stations_df = buffer_stations_df.to_crs(gdf_all.crs)
buffer_stations_df['geometry'] = buffer_stations_df.buffer(buffer)
buffer_stations_df['buffer_area_km2'] = buffer_stations_df.geometry.area / 1e6 
# OA centroids as points, projected to match the buffers.
tmp = oa_final_df[oa_final_df['industry'] == 'All'].copy()
tmp['geometry'] = tmp.apply(lambda x: Point(x['oa_long'], x['oa_lat']), axis = 1)
tmp = gpd.GeoDataFrame(tmp, geometry = tmp['geometry'], crs = 'EPSG:4326')
tmp = tmp.to_crs(gdf_all.crs)
joined = tmp.sjoin(buffer_stations_df, predicate='within')
# NOTE(review): `oas` shadows the shapefile path variable defined earlier.
oas = joined['oa21cd'].unique()
# Yearly job and population totals for the near-station OAs, plus job ratios.
t = filtered_jobs_df[filtered_jobs_df['oa21cd'].isin(oas)].groupby('year').sum()[['jobs_count']]
t1 = final_pop_oa_df[final_pop_oa_df['oa21cd'].isin(oas)].groupby('year').sum()[['total_pop','pop_19_64']]
t = t.join(t1).reset_index()
t['job_percent'] = t['jobs_count'] / t['total_pop']
t['job_percent1'] = t['jobs_count'] / t['pop_19_64']

Run HDBSCAN Clustering Over Time¶

In [15]:
# Reproject to WGS84 before clustering with the haversine metric.
# NOTE(review): the (x, y) = (lon, lat) coordinates are passed to HDBSCAN in
# degrees, but haversine distance conventionally expects (lat, lon) in
# radians — confirm this is intended, as it affects cluster scale.
gdf_all_tmp = gdf_all.to_crs("EPSG:4326")

results, cluster_silh_df, final_gdf_clustered, transitions, cluster_trends, cluster_trends_pivot, cluster_proportion_trends_pivot, \
    sector_dist, dominant_sectors, access_dist, dominant_access, gini_by_cluster, access_gini_by_cluster = analyze_over_time(
        gdf_all_tmp, metric='haversine')
(1654, 54)
(96, 4)
After trying the ranges for min and epis, the best values are (15, 0) with score is 0.5206878326553299

Year: 2010
Moran's I: 0.0363, p-value: 0.1290
Clusters found: 34
(1697, 54)
(96, 4)
After trying the ranges for min and epis, the best values are (15, 0) with score is 0.5091418924892184

Year: 2011
Moran's I: 0.0407, p-value: 0.1240
Clusters found: 34
(1694, 54)
(96, 4)
After trying the ranges for min and epis, the best values are (15, 0) with score is 0.49652268697254526

Year: 2012
Moran's I: 0.0410, p-value: 0.1180
Clusters found: 33
(1897, 54)
(96, 4)
After trying the ranges for min and epis, the best values are (15, 0) with score is 0.5106155314098126

Year: 2013
Moran's I: 0.0778, p-value: 0.0430
Clusters found: 36
(1940, 54)
(97, 4)
After trying the ranges for min and epis, the best values are (15, 0) with score is 0.4812629159403616

Year: 2014
Moran's I: 0.0869, p-value: 0.0280
Clusters found: 35
(2004, 54)
(97, 4)
After trying the ranges for min and epis, the best values are (15, 0) with score is 0.449516854287888

Year: 2015
Moran's I: 0.0842, p-value: 0.0330
Clusters found: 40
(2120, 54)
(99, 4)
After trying the ranges for min and epis, the best values are (15, 0) with score is 0.41707546042537463

Year: 2016
Moran's I: 0.0710, p-value: 0.0390
Clusters found: 44
(2530, 54)
(104, 4)
After trying the ranges for min and epis, the best values are (15, 0) with score is 0.4452814908612487

Year: 2017
Moran's I: 0.0687, p-value: 0.0440
Clusters found: 48
(2653, 54)
(105, 4)
After trying the ranges for min and epis, the best values are (15, 0) with score is 0.4630512364864433

Year: 2018
Moran's I: 0.0553, p-value: 0.0780
Clusters found: 53
(2966, 54)
(110, 4)
After trying the ranges for min and epis, the best values are (15, 0) with score is 0.4367253113781164

Year: 2019
Moran's I: 0.0252, p-value: 0.1820
Clusters found: 58
(3018, 54)
(110, 4)
After trying the ranges for min and epis, the best values are (15, 0) with score is 0.44007158394547174

Year: 2020
Moran's I: 0.0160, p-value: 0.2560
Clusters found: 61
(3872, 54)
(111, 4)
After trying the ranges for min and epis, the best values are (15, 0) with score is 0.5013442274578016

Year: 2021
Moran's I: -0.0077, p-value: 0.4380
Clusters found: 69
(4207, 54)
(111, 4)
After trying the ranges for min and epis, the best values are (15, 0) with score is 0.4905271081684077

Year: 2022
Moran's I: -0.0090, p-value: 0.4370
Clusters found: 74
(4718, 54)
(111, 4)
After trying the ranges for min and epis, the best values are (15, 0) with score is 0.5347284507838003

Year: 2023
Moran's I: -0.0150, p-value: 0.4980
Clusters found: 83
(5336, 54)
(111, 4)
After trying the ranges for min and epis, the best values are (15, 0) with score is 0.5672692177179313

Year: 2024
Moran's I: -0.0163, p-value: 0.4910
Clusters found: 95

Plot Business Densities, Cluster Counts and Densities and Clusters around NLE Stations¶

In [19]:
# Yearly business density within a 500 m buffer of each station.
buffer = 500
buffer_stations_df = stations_df_tmp.copy()
# Match the clustered data's CRS so the buffer distance is in its units.
buffer_stations_df = buffer_stations_df.to_crs(final_gdf_clustered.crs)
buffer_stations_df['geometry'] = buffer_stations_df.buffer(buffer)
buffer_stations_df['buffer_area_km2'] = buffer_stations_df.geometry.area / 1e6 
# Businesses falling inside a station buffer, counted per station and year.
joined = gpd.sjoin(final_gdf_clustered, buffer_stations_df, how='inner', predicate='within')
business_counts = joined.groupby(['index_right', 'year']).size().reset_index(name='business_count')

# index_right is the station's row index in buffer_stations_df.
stations_with_density = buffer_stations_df.join(
    business_counts.set_index('index_right')[['business_count', 'year']]).fillna(0).rename(columns = {
    'year' : 'Year', 'name' : 'Station Name'})

# Businesses per km^2 of buffer area.
stations_with_density['Business Density'] = stations_with_density['business_count'] / stations_with_density['buffer_area_km2']
In [262]:
# Business density around each station over time, for 500 m and 750 m buffers:
# first overall, then broken down by sector.
buffers = [500, 750]
for buffer in buffers:
    buffer_stations_df = stations_df_tmp.copy()
    buffer_stations_df = buffer_stations_df.to_crs(final_gdf_clustered.crs)
    buffer_stations_df['geometry'] = buffer_stations_df.buffer(buffer)
    buffer_stations_df['buffer_area_km2'] = buffer_stations_df.geometry.area / 1e6 
    joined = gpd.sjoin(final_gdf_clustered, buffer_stations_df, how='inner', predicate='within')
    business_counts = joined.groupby(['index_right', 'year']).size().reset_index(name='business_count')
    
    # Attach per-year counts back to the station rows via index_right.
    stations_with_density = buffer_stations_df.join(
        business_counts.set_index('index_right')[['business_count', 'year']]).fillna(0).rename(columns = {
        'year' : 'Year', 'name' : 'Station Name'})
    
    stations_with_density['Business Density'] = stations_with_density['business_count'] / stations_with_density['buffer_area_km2']
    sns.lineplot(data=stations_with_density, x='Year', y='Business Density', palette='Set2',
                 hue = 'Station Name', linewidth=2.5, marker='o')
    plt.title(f"Business Density within Stations {buffer}m Buffer")
    plt.show()

# Same as above but split by sector ('Other' excluded); one line per
# station/sector combination.
for buffer in buffers:
    buffer_stations_df = stations_df_tmp.copy()
    buffer_stations_df = buffer_stations_df.to_crs(final_gdf_clustered.crs)
    buffer_stations_df['geometry'] = buffer_stations_df.buffer(buffer)
    buffer_stations_df['buffer_area_km2'] = buffer_stations_df.geometry.area / 1e6 
    joined = gpd.sjoin(final_gdf_clustered, buffer_stations_df, how='inner', predicate='within')
    joined = joined[joined['scat_category'] != 'Other']
    business_counts = joined.groupby(['index_right', 'year', 'scat_category']).size().reset_index(name='business_count')
    
    stations_with_density = buffer_stations_df.join(
        business_counts.set_index('index_right')[['business_count', 'scat_category', 'year']]).fillna(0).rename(columns = {
        'year' : 'Year', 'name' : 'Station Name', 'scat_category' : 'Sector'})
    # Combined hue key, e.g. "Nine Elms - Retail".
    stations_with_density['Station Name Sector'] = stations_with_density.apply(lambda x: ' - '.join(
        [x['Station Name'], x['Sector']]), axis = 1)
    
    stations_with_density['Business Density'] = stations_with_density['business_count'] / stations_with_density['buffer_area_km2']
    plt.figure(figsize=(8, 5))
    sns.lineplot(data=stations_with_density, x='Year', y='Business Density', palette='Set2',
                 hue = 'Station Name Sector', linewidth=2.5, marker='o')
    plt.title(f"Business Density within Stations {buffer}m Buffer")
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [253]:
def station_density_for_buffer(buffer_m, label):
    """Per-station, per-year business density within a circular buffer.

    Buffers each station point by ``buffer_m`` (units of the shared CRS,
    assumed metres -- TODO confirm final_gdf_clustered.crs is projected),
    joins in the businesses falling inside each buffer, and returns one row
    per (station, year) with 'Business Density' = businesses per km^2 and a
    'Buffer Distance' label column set to ``label``.
    """
    buffered = stations_df_tmp.copy().to_crs(final_gdf_clustered.crs)
    buffered['geometry'] = buffered.buffer(buffer_m)
    buffered['buffer_area_km2'] = buffered.geometry.area / 1e6
    # 'index_right' from the spatial join identifies the matched station row.
    joined = gpd.sjoin(final_gdf_clustered, buffered, how='inner', predicate='within')
    counts = joined.groupby(['index_right', 'year']).size().reset_index(name='business_count')
    density = buffered.join(
        counts.set_index('index_right')[['business_count', 'year']]).fillna(0).rename(columns = {
        'year' : 'Year', 'name' : 'Station Name'})
    density['Business Density'] = density['business_count'] / density['buffer_area_km2']
    density['Buffer Distance'] = label
    return density

# The 500 m and 750 m pipelines were previously copy-pasted; one helper
# call per radius keeps them in sync.
stations_with_density1 = station_density_for_buffer(500, '500m')
stations_with_density2 = station_density_for_buffer(750, '750m')

# Average density across stations for each (year, buffer) combination.
stations_with_density = pd.concat([stations_with_density1, stations_with_density2])[[
    'Year', 'Buffer Distance', 'Business Density']].groupby(['Year', 'Buffer Distance']).mean().reset_index()

sns.lineplot(data=stations_with_density, x='Year', y='Business Density', palette='Set2', hue = 'Buffer Distance',
             linewidth=2.5, marker='o')
plt.title("Business Density Close to New Stations")
plt.show()
No description has been provided for this image
In [261]:
############################
# Same two-buffer comparison as the previous cell, but additionally split by
# business sector ('Other' excluded). 500 m pass first:
buffer = 500
buffer_stations_df = stations_df_tmp.copy()
# Match the businesses' CRS so buffer()/area use its units (assumed metric).
buffer_stations_df = buffer_stations_df.to_crs(final_gdf_clustered.crs)
buffer_stations_df['geometry'] = buffer_stations_df.buffer(buffer)
buffer_stations_df['buffer_area_km2'] = buffer_stations_df.geometry.area / 1e6
# 'index_right' from the spatial join identifies the matched station row.
joined = gpd.sjoin(final_gdf_clustered, buffer_stations_df, how='inner', predicate='within')
joined = joined[joined['scat_category'] != 'Other']
business_counts = joined.groupby(['index_right', 'year', 'scat_category']).size().reset_index(name='business_count')

stations_with_density1 = buffer_stations_df.join(
    business_counts.set_index('index_right')[['business_count', 'scat_category', 'year']]).fillna(0).rename(columns = {
    'year' : 'Year', 'name' : 'Station Name', 'scat_category' : 'Sector'})
stations_with_density1['Buffer Distance'] = '500m'
# Combined legend key: sector plus buffer radius.
stations_with_density1['Sector Buffer Distance'] = stations_with_density1.apply(lambda x: ' - '.join(
    [x['Sector'], x['Buffer Distance']]), axis = 1)
stations_with_density1['Business Density'] = stations_with_density1['business_count'] / stations_with_density1['buffer_area_km2']

# 750 m pass (identical pipeline with a wider radius).
buffer = 750
buffer_stations_df = stations_df_tmp.copy()
buffer_stations_df = buffer_stations_df.to_crs(final_gdf_clustered.crs)
buffer_stations_df['geometry'] = buffer_stations_df.buffer(buffer)
buffer_stations_df['buffer_area_km2'] = buffer_stations_df.geometry.area / 1e6
joined = gpd.sjoin(final_gdf_clustered, buffer_stations_df, how='inner', predicate='within')
joined = joined[joined['scat_category'] != 'Other']
business_counts = joined.groupby(['index_right', 'year', 'scat_category']).size().reset_index(name='business_count')

stations_with_density2 = buffer_stations_df.join(
    business_counts.set_index('index_right')[['business_count', 'scat_category', 'year']]).fillna(0).rename(columns = {
    'year' : 'Year', 'name' : 'Station Name', 'scat_category' : 'Sector'})
stations_with_density2['Buffer Distance'] = '750m'
stations_with_density2['Sector Buffer Distance'] = stations_with_density2.apply(lambda x: ' - '.join(
    [x['Sector'], x['Buffer Distance']]), axis = 1)
stations_with_density2['Business Density'] = stations_with_density2['business_count'] / stations_with_density2['buffer_area_km2']

# Average density across stations for each (year, sector, buffer) combination.
stations_with_density = pd.concat([stations_with_density1, stations_with_density2])[[
    'Year', 'Sector Buffer Distance', 'Business Density']].groupby(['Year',
                                     'Sector Buffer Distance']).mean().reset_index()
plt.figure(figsize=(8, 5))
sns.lineplot(data=stations_with_density, x='Year', y='Business Density', palette='Set2', hue = 'Sector Buffer Distance',
             linewidth=2.5, marker='o')
plt.title("Business Density Close to New Stations by Industry")
plt.show()
No description has been provided for this image
In [55]:
# Number of distinct cluster labels observed in each year.
t = (
    final_gdf_clustered[['year', 'cluster']]
    .drop_duplicates()
    .sort_values(by='year')
    .groupby('year')
    .count()
    .rename(columns={'cluster': 'Cluster Count'})
)
plt.figure(figsize=(7, 4))
sns.lineplot(data=t, x='year', y='Cluster Count', palette='Set2', linewidth=2.5, marker='o')
plt.title("Overall VNEB Cluster Counts")
Out[55]:
Text(0.5, 1.0, 'Overall VNEB Cluster Counts')
No description has been provided for this image

Plot Silhoutte Score Overtime and For 2024 Year Over Search Space¶

In [53]:
cluster_silh_df_tmp = cluster_silh_df.copy()
# Label each search-space point as "min_cluster_size - epsilon_m".
cluster_silh_df_tmp['Hyper Parameters'] = cluster_silh_df_tmp.apply(
    lambda row: f"{int(row['min_cluster_size'])} - {int(row['epsilon_m'])}", axis=1)

plt.figure(figsize=(7, 4))
silh_2024 = cluster_silh_df_tmp[cluster_silh_df_tmp['year'] == 2024]
sns.lineplot(data=silh_2024, x='Hyper Parameters', y='silhouette',
             palette='Set2', linewidth=2.5, marker='o')
plt.xticks(rotation=45, ha='right', fontsize=7)
plt.title("Clustering Silhouette Score for 2024")
Out[53]:
Text(0.5, 1.0, 'Clustering Silhouette Score for 2024')
No description has been provided for this image
In [54]:
plt.figure(figsize=(7, 4))
# Fix one hyper-parameter setting and track its silhouette score by year.
chosen_params = (cluster_silh_df['min_cluster_size'] == 15) & (cluster_silh_df['epsilon_m'] == 0)
sns.lineplot(data=cluster_silh_df[chosen_params], x='year', y='silhouette',
             palette='Set2', linewidth=2.5, marker='o')
plt.title("Clustering Silhouette Score over time")
Out[54]:
Text(0.5, 1.0, 'Clustering Silhouette Score over time')
No description has been provided for this image

Generate Other Clusters Plots and Entropy (Diversity Plots)¶

In [27]:
years = [2012, 2015, 2018, 2021, 2024]

def plot_trend(data, title, ylabel, x=None, y=None):
    """Draw one time-series line chart with the shared styling of this section.

    ``data`` is either a DataFrame (then ``x``/``y`` name its columns) or a
    Series (then ``x``/``y`` stay None and seaborn plots it directly).
    The x-axis label is always 'Date', matching the original stanzas.
    """
    if x is None:
        sns.lineplot(data)
    else:
        sns.lineplot(data, x=x, y=y)
    plt.title(title)
    plt.xlabel('Date')
    plt.ylabel(ylabel)
    plt.tight_layout()
    plt.show()

# These eight plots were previously eight copy-pasted stanzas; cluster label
# -1 is noise and is excluded (or isolated) explicitly in each selection.
gini_clusters = gini_by_cluster[gini_by_cluster['cluster'] != -1]
plot_trend(gini_clusters, 'Business Clusters Sector Gini Index Over Time',
           'Sector Gini Index', x='year', y='gini_index')
plot_trend(gini_clusters.groupby('year').mean()['gini_index'],
           'Average Business Clusters Sector Gini Index Over Time',
           'Average Sector Gini Index')

access_clusters = access_gini_by_cluster[access_gini_by_cluster['cluster'] != -1]
plot_trend(access_clusters, 'Business Clusters Accessibility Gini Index Over Time',
           'Accessibility Gini Index', x='year', y='gini_index')
plot_trend(access_clusters.groupby('year').mean()['gini_index'],
           'Average Business Clusters Accessibility Gini Index Over Time',
           'Average Accessibility Gini Index')

trend_clusters = cluster_trends[cluster_trends['cluster'] != -1]
trend_noise = cluster_trends[cluster_trends['cluster'] == -1]
plot_trend(trend_clusters, 'Business Cluster Sizes Over Time',
           'Number of Businesses', x='year', y='count')
plot_trend(trend_noise, 'Business Noise Size Over Time',
           'Number of Businesses', x='year', y='count')
plot_trend(trend_clusters, 'Business Cluster Proportion Over Time',
           'Proportion of Businesses', x='year', y='proportion')
plot_trend(trend_noise, 'Business Noise Proportion Over Time',
           'Proportion of Businesses', x='year', y='proportion')

# Wide-format trends: one line per cluster.
# NOTE(review): columns[1:] drops the first pivot column -- presumably the
# noise cluster (-1); confirm against how cluster_trends_pivot is built.
cluster_trends_pivot[cluster_trends_pivot.columns[1:]].plot(figsize=(12,6))
plt.title('Business Cluster Sizes Over Time')
plt.xlabel('Date')
plt.ylabel('Number of Businesses')
plt.legend(title='Cluster')
plt.tight_layout()
plt.show()

cluster_proportion_trends_pivot[cluster_proportion_trends_pivot.columns[1:]].plot(figsize=(12,6))
plt.title('Business Cluster Proportion Over Time')
plt.xlabel('Date')
plt.ylabel('Proportion of Businesses')
plt.legend(title='Cluster')
plt.tight_layout()
plt.show()

# Year x cluster heatmaps of the Gini indices (noise cluster excluded).
# NOTE(review): fmt='g' only affects cell annotations, which are off here
# (annot=True is not set) -- it is currently a no-op.
pivot = access_gini_by_cluster[(access_gini_by_cluster['cluster'] != -1)]
pivot = pivot.pivot_table(index='year', columns='cluster',
                                values='gini_index', aggfunc='sum').fillna(0)
sns.heatmap(pivot, cmap='viridis', fmt='g')
plt.title('Accessibility Gini Index Distribution Across Clusters')
plt.show()

pivot = gini_by_cluster[(gini_by_cluster['cluster'] != -1)]
pivot = pivot.pivot_table(index='year', columns='cluster',
                                values='gini_index', aggfunc='sum').fillna(0)
sns.heatmap(pivot, cmap='viridis', fmt='g')
plt.title('Sector Gini Index Distribution Across Clusters')
plt.show()

# Category x cluster count heatmaps; pivot.columns[1:] again skips the first
# (noise) column.
pivot = sector_dist.pivot_table(index='scat_category', columns='cluster',
                                values='count', aggfunc='sum').fillna(0)
sns.heatmap(pivot[pivot.columns[1:]], cmap='viridis', fmt='g')
plt.title('Sector Distribution Across Clusters')
plt.show()

pivot = access_dist.pivot_table(index='PTAL2023_main', columns='cluster',
                                values='count', aggfunc='sum').fillna(0)
sns.heatmap(pivot[pivot.columns[1:]], cmap='viridis', fmt='g')
plt.title('Accessibility Distribution Across Clusters')
plt.show()

# One sector-by-cluster heatmap per snapshot year (colorbar on last panel only).
fig, axes = plt.subplots(1, 5, figsize=(20, 4))
for i, ax in enumerate(axes):
    year = years[i]
    pivot = sector_dist[sector_dist['year'] == year].pivot_table(index='scat_category', columns='cluster',
                                values='count', aggfunc='sum').fillna(0)
    # columns[1:] skips the first pivot column (the noise cluster).
    sns.heatmap(pivot[pivot.columns[1:]], ax=ax, cmap='viridis', cbar=(i == 4), fmt='g')  # Show colorbar only on last
    ax.set_title(f'Sector-Clusters {year}')

# Same faceting for accessibility (PTAL) categories.
fig, axes = plt.subplots(1, 5, figsize=(20, 4))
for i, ax in enumerate(axes):
    year = years[i]
    pivot = access_dist[access_dist['year'] == year].pivot_table(index='PTAL2023_main', columns='cluster',
                                values='count', aggfunc='sum').fillna(0)
    sns.heatmap(pivot[pivot.columns[1:]], ax=ax, cmap='viridis', cbar=(i == 4), fmt='g')  # Show colorbar only on last
    ax.set_title(f'Accessibility-Clusters {year}')

# Per-company cluster history is identical for every year in the loop, so the
# sort/shift computation is hoisted out of the loop (previously recomputed on
# every iteration). `.copy()` on the filtered frame avoids pandas
# chained-assignment warnings when adding the source/target columns.
df = final_gdf_clustered.sort_values(by=['CompanyID', 'year']).copy()
df['prev_cluster'] = df.groupby('CompanyID')['cluster'].shift(1)
df['prev_year'] = df.groupby('CompanyID')['year'].shift(1)

for year in years:
    # Companies observed in `year` that also have a prior observation.
    transitions_tmp = df[df['year'] == year].dropna(subset=['prev_cluster']).copy()
    # Node labels of the form "<year>_C<cluster>".
    transitions_tmp['source'] = transitions_tmp['prev_year'].astype(int).astype(str) + '_C' + transitions_tmp['prev_cluster'].astype(int).astype(str)
    transitions_tmp['target'] = transitions_tmp['year'].astype(int).astype(str) + '_C' + transitions_tmp['cluster'].astype(int).astype(str)
    transition_counts = transitions_tmp.groupby(['source', 'target']).size().reset_index(name='count')

    # go.Sankey needs integer node indices, so map each label to its position.
    all_nodes = pd.unique(transition_counts[['source', 'target']].values.ravel())
    node_mapping = {label: idx for idx, label in enumerate(all_nodes)}

    transition_counts['source_idx'] = transition_counts['source'].map(node_mapping)
    transition_counts['target_idx'] = transition_counts['target'].map(node_mapping)

    fig = go.Figure(data=[go.Sankey(
            node=dict(
                pad=15,
                thickness=20,
                line=dict(color='black', width=0.5),
                label=all_nodes,
                color='lightblue'),
            link=dict(
                source=transition_counts['source_idx'],
                target=transition_counts['target_idx'],
                value=transition_counts['count']))])
    fig.update_layout(title_text=f"Cluster Transitions Sankey {year}",font_size=12, width=1000, height=500)
    fig.show()
    
# One map panel per snapshot year; shared extent, points coloured by cluster id.
fig, axes = plt.subplots(1, 5, figsize=(20, 5), sharex=True, sharey=True)
for i, year in enumerate(years):
    ax = axes[i]
    tmp_df = final_gdf_clustered[final_gdf_clustered['year'] == year].copy()
    # Web-Mercator so the contextily basemap tiles line up.
    if tmp_df.crs != 'EPSG:3857':
        tmp_df = tmp_df.to_crs(epsg=3857)
    tmp_df.plot(column='cluster', cmap='tab20', legend=False, ax=ax, markersize=5)

    # NOTE(review): this assumes stations_df_tmp is already in EPSG:3857 at
    # this point; it is only explicitly re-projected further down -- confirm
    # cell execution order.
    ax.scatter(stations_df_tmp.geometry.x, stations_df_tmp.geometry.y, color='blue',marker='*',s=300,label='Station',zorder=5)

    ctx.add_basemap(ax, source=ctx.providers.CartoDB.PositronNoLabels, crs=tmp_df.crs)
    ax.set_title(f'Year: {year}', fontsize=12)
    ax.set_axis_off()

plt.tight_layout()
plt.show()

############################
# Work in Web-Mercator so businesses, station buffers, and basemap align.
if stations_df_tmp.crs != 'EPSG:3857':
    stations_df_tmp = stations_df_tmp.to_crs(epsg=3857)

# Dashed-ring overlays: 500 m and 750 m buffers around each station.
stations_buffer1 = stations_df_tmp.copy()
stations_buffer1['geometry'] = stations_buffer1.buffer(500)
stations_buffer2 = stations_df_tmp.copy()
stations_buffer2['geometry'] = stations_buffer2.buffer(750)

cluster_ids = sorted(final_gdf_clustered[final_gdf_clustered['cluster'] != -1]['cluster'].unique())
n_clusters = len(cluster_ids)

# FIX: matplotlib.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in
# 3.9; plt.get_cmap(name, lut) is the drop-in replacement.
cmap = plt.get_cmap('tab20', n_clusters)
# NOTE(review): unit-spaced boundaries assume cluster ids are contiguous
# integers -- confirm, otherwise colours and the colorbar drift apart.
norm = mcolors.BoundaryNorm(boundaries=np.arange(min(cluster_ids)-0.5, max(cluster_ids)+1.5), ncolors=n_clusters)

for year in years:
    fig, ax = plt.subplots(figsize=(8, 5))
    tmp_df = final_gdf_clustered[(final_gdf_clustered['year'] == year) & (final_gdf_clustered['cluster'] != -1)
    ].copy()
    if tmp_df.crs != 'EPSG:3857':
        tmp_df = tmp_df.to_crs(epsg=3857)

    tmp_df.plot(ax=ax, column='cluster', cmap=cmap, markersize=5, legend=False, zorder=3)

    stations_buffer1.plot(ax=ax, color='none', edgecolor='black', linestyle='--', linewidth=1.2, alpha=0.6, zorder=2)
    stations_buffer2.plot(ax=ax, color='none', edgecolor='black', linestyle='--', linewidth=1.2, alpha=0.6, zorder=2)

    stations_df_tmp.plot(ax=ax, color='black', marker='o', markersize=50, label='Station', zorder=4)
    ctx.add_basemap(ax, source=ctx.providers.CartoDB.PositronNoLabels, crs=tmp_df.crs)
    ax.set_title(f'Business Clusters – Year: {year}', fontsize=14)
    ax.set_axis_off()
    # GeoDataFrame.plot does not hand back a mappable, so build a dummy one
    # for the shared colorbar.
    sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
    sm.set_array([])  # Only needed for colorbar
    cbar = fig.colorbar(sm, ax=ax, orientation='vertical', shrink=0.6, pad=0.01)
    cbar.set_label('Cluster ID', fontsize=10)
    plt.tight_layout()
    plt.show()

#######################extra analysis##############################
final_gdf_clustered['y'] = final_gdf_clustered.geometry.y
final_gdf_clustered['x'] = final_gdf_clustered.geometry.x

# Build one dict per (year, cluster, buffer radius) and create the DataFrame
# once at the end -- the previous version concatenated one-row DataFrames,
# which is quadratic in the number of clusters.
cluster_records = []
metrics = []

for year in final_gdf_clustered['year'].unique():
    df_year = final_gdf_clustered[final_gdf_clustered['year'] == year]
    # Cluster label -1 is noise and never counted as a cluster.
    n_clusters = df_year['cluster'].nunique() - (1 if -1 in df_year['cluster'].unique() else 0)
    cluster_sizes = df_year[df_year['cluster'] != -1].groupby('cluster').size()
    mean_size = cluster_sizes.mean()
    # Equivalent to (total clustered businesses) / (all businesses this year).
    share_clustered = len(cluster_sizes) * mean_size / len(df_year)
    metrics.append({'year': year, 'Clusters Count': n_clusters, 'Cluster Size Average': mean_size,
                    'Clustered Share': share_clustered})
    for label, group in df_year.groupby('cluster'):
        if label == -1: continue  # ignore noise
        centroid = Point(group['x'].mean(), group['y'].mean())
        # One record per buffer radius, tagged with its label.
        for buffer_m, buffer_label in ((500, '500m'), (750, '750m')):
            within, min_dist, station_dists = near_station(centroid, stations_df, station_buffer=buffer_m)
            record = {'year': year, 'cluster': label, 'Centroid': centroid, 'Min Distance': min_dist,
                      'Within Buffer': within, 'Buffer Distance': buffer_label}
            # Loop variable renamed from `k`, which shadowed the module-level
            # `k` imported from pointpats.
            for station_name, dist in station_dists.items():
                record[f'{station_name} Distance'] = dist
            cluster_records.append(record)

new_clusters = pd.DataFrame(cluster_records)
domin_new_clusters = new_clusters.set_index(['year', 'cluster']).join(dominant_sectors.set_index(['year', 'cluster'])).reset_index()
sector_new_clusters = new_clusters.set_index(['year', 'cluster']).join(sector_dist.set_index(['year', 'cluster'])).reset_index()


# Clusters whose centroid falls inside a station buffer, counted per year and
# buffer radius.
t = domin_new_clusters[domin_new_clusters['Within Buffer']].groupby(['year', 'Buffer Distance']).count()[['cluster']].reset_index().rename(
                                columns = {'cluster' : 'Cluster Count', 'year' : 'Year'})
# Same counts additionally split by dominant sector ('Other' excluded).
tt = domin_new_clusters[domin_new_clusters['Within Buffer']].groupby(['year', 'Buffer Distance', 'scat_category']
                        ).count()[['cluster']].reset_index().rename(columns = {'cluster' : 'Cluster Count',
                        'scat_category' : 'Sector', 'year' : 'Year'})
tt = tt[tt['Sector'] != 'Other']
tt['Sector Buffer Distance'] = tt.apply(lambda x: ' - '.join([x['Sector'], x['Buffer Distance']]), axis = 1)
sns.lineplot(data=t, x='Year', y='Cluster Count', palette='Set2', hue = 'Buffer Distance', linewidth=2.5, marker='o')
plt.title("Cluster Counts Close to New Stations")
plt.show()

sns.lineplot(data=tt, x='Year', y='Cluster Count', hue='Sector Buffer Distance', palette='Set2', linewidth=2.5, marker='o')
plt.title("Cluster Counts Close to New Stations by Sector")
plt.show()

# Business counts (sums of per-sector 'count') for the same near-station
# clusters, overall and by sector.
t = sector_new_clusters[sector_new_clusters['Within Buffer']][['year', 'Buffer Distance', 'count']].groupby(['year', 'Buffer Distance']
                 ).sum().reset_index().rename(columns = {'count' : 'Business Count', 'year' : 'Year'})
tt = sector_new_clusters[sector_new_clusters['Within Buffer']][['year', 'Buffer Distance', 'scat_category', 'count']].groupby([
    'year', 'Buffer Distance', 'scat_category']).sum().reset_index().rename(columns = {'count' : 'Business Count',
                                    'scat_category' : 'Sector', 'year' : 'Year'})
tt = tt[tt['Sector'] != 'Other']
tt['Sector Buffer Distance'] = tt.apply(lambda x: ' - '.join([x['Sector'], x['Buffer Distance']]), axis = 1)
sns.lineplot(data=t, x='Year', y='Business Count', palette='Set2', hue = 'Buffer Distance', linewidth=2.5, marker='o')
plt.title("Business Counts Close to New Stations")
plt.show()

sns.lineplot(data=tt, x='Year', y='Business Count', hue='Sector Buffer Distance', palette='Set2', linewidth=2.5, marker='o')
plt.title("Business Counts Close to New Stations by Sector")
plt.show()

# Mean centroid distance to each of the two new stations over time
# (seaborn aggregates the repeated (year, cluster) rows to the mean line).
sns.lineplot(data=domin_new_clusters[domin_new_clusters['Buffer Distance'] == '500m'], x='year', y='Nine Elms Distance', label='To Nine Elms (500m)')
sns.lineplot(data=domin_new_clusters[domin_new_clusters['Buffer Distance'] == '500m'], x='year', y='Battersea Power Station Distance',
             label='To Battersea Power Station (500m)')
sns.lineplot(data=domin_new_clusters[domin_new_clusters['Buffer Distance'] == '750m'], x='year', y='Nine Elms Distance', label='To Nine Elms (750m)')
sns.lineplot(data=domin_new_clusters[domin_new_clusters['Buffer Distance'] == '750m'], x='year', y='Battersea Power Station Distance',
             label='To Battersea Power Station (750m)')
plt.ylabel("Mean Distance to Station (m)")
plt.title("Cluster Centroid Proximity Over Time")
plt.show()

##############
metrics_df = pd.DataFrame(metrics)

# Three per-year clustering metrics, side by side.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
metric_panels = [
    ('Clusters Count', "Number of Clusters"),
    ('Cluster Size Average', "Avg. Cluster Size"),
    ('Clustered Share', "Share of Clustered Businesses"),
]
for ax, (metric_col, panel_title) in zip(axes, metric_panels):
    sns.lineplot(data=metrics_df, x='year', y=metric_col, ax=ax).set_title(panel_title)
plt.tight_layout()
plt.show()

############
# Yearly share of businesses labelled -1 (noise) by the clustering.
noise_stats = final_gdf_clustered.groupby('year')['cluster'].apply(
    lambda labels: (labels == -1).sum() / len(labels)).reset_index()
noise_stats.columns = ['Year', 'Noise Share']

sns.lineplot(data=noise_stats, x='Year', y='Noise Share')
plt.title("Share of Businesses Not in Clusters (Noise)")
plt.ylabel("Proportion")
plt.show()

#################
def plot_cluster_diversity(col, title):
    """Plot mean Shannon entropy of ``col`` within non-noise clusters per year.

    For each (year, cluster != -1) group, computes the entropy of the value
    counts of ``col`` (scipy's ``entropy`` normalises raw counts itself),
    plots the yearly mean, and returns the per-(year, cluster) frame.
    """
    records = []
    for year in sorted(final_gdf_clustered['year'].unique()):
        df_year = final_gdf_clustered[final_gdf_clustered['year'] == year]
        for label, group in df_year.groupby('cluster'):
            if label == -1: continue
            records.append({'year': year, 'cluster': label,
                            'entropy': entropy(group[col].value_counts())})
    result = pd.DataFrame(records)
    sns.lineplot(data=result.groupby('year')['entropy'].mean().reset_index(), x='year', y='entropy')
    plt.title(title)
    plt.ylabel("Shannon Entropy")
    plt.show()
    return result

# Previously two copy-pasted loops differing only in the grouped column.
diversity_df = plot_cluster_diversity(
    'scat_category', "Mean Business Sector Diversity (Entropy) in Clusters Over Time")
diversity_df = plot_cluster_diversity(
    'bus_PTAL_2023', "Mean Business Accessibility Diversity (Entropy) in Clusters Over Time")

###############################
# Per-cluster entropy/gini of the sector mix ('Other' excluded), then a
# within-year min-max normalisation and a threshold-based classification.
agg_results = []

for year in sorted(final_gdf_clustered['year'].unique()):
    year_df = final_gdf_clustered[final_gdf_clustered['year'] == year]
    for cluster_id, group in year_df.groupby('cluster'):
        if cluster_id == -1: continue  # skip noise
        counts = group[group['scat_category'] != 'Other']['scat_category'].value_counts()
        agg_results.append({
            'year': year,
            'cluster': cluster_id,
            'entropy': compute_entropy(counts),
            'gini': compute_gini(counts),
            'n_businesses': len(group),
            'x': group['x'].mean(),
            'y': group['y'].mean()
        })

agg_df = pd.DataFrame(agg_results)

# Min-max normalise within each year so the quantile thresholds compare like
# with like. NOTE(review): a year where all clusters share one value yields
# NaN (division by zero range).
agg_df['entropy_norm'] = agg_df.groupby('year')['entropy'].transform(
    lambda x: (x - x.min()) / (x.max() - x.min())
)
agg_df['gini_norm'] = agg_df.groupby('year')['gini'].transform(
    lambda x: (x - x.min()) / (x.max() - x.min())
)
ent_thresh = agg_df['entropy_norm'].quantile(0.7)
gini_thresh = agg_df['gini_norm'].quantile(0.3)
# BUG FIX: the two thresholds were swapped in the original call
# (ent_thresh=gini_thresh and gini_thresh=ent_thresh).
# NOTE(review): the recorded traceback shows this call raising TypeError --
# confirm classify_row actually accepts gini_thresh/gini_col; the version
# visible at the top of this notebook does not.
agg_df['Agglomeration Type'] = agg_df.apply(lambda x: classify_row(
    x, ent_thresh=ent_thresh, gini_thresh=gini_thresh,
    entrop_col='entropy_norm', gini_col='gini_norm'), axis=1)

# Per (year, type): cluster count plus total businesses in those clusters.
trend_df = agg_df.groupby(['year', 'Agglomeration Type']).size().reset_index(name='count').set_index(['year', 'Agglomeration Type'
     ]).join(agg_df[['year', 'Agglomeration Type', 'n_businesses']].groupby(['year', 'Agglomeration Type']).sum()[['n_businesses']]
            ).reset_index()
# trend_df is deliberately rebuilt here: yearly totals ('Total Count',
# 'Total Businesses') joined back onto the per-type rows computed above.
trend_df = agg_df.groupby(['year']).size().reset_index(name='count').set_index(['year']).join(
    agg_df[['year', 'Agglomeration Type', 'n_businesses']].groupby(['year']).sum()[['n_businesses']]
    ).rename(columns = {'count' : 'Total Count', 'n_businesses': 'Total Businesses'}).join(
    trend_df.set_index('year')).reset_index()

# Per-type shares of the yearly totals.
trend_df['Cluster Proportions'] = trend_df['count'] / trend_df['Total Count']
trend_df['Businesses Proportion'] = trend_df['n_businesses'] / trend_df['Total Businesses']

plt.figure(figsize=(10, 6))
sns.lineplot(data=trend_df, x='year', y='count', hue='Agglomeration Type', marker='o')
plt.title("Agglomeration Type Counts Over Time")
plt.ylabel("Number of Clusters")
plt.xlabel("Year")
plt.grid(True)
plt.show()

plt.figure(figsize=(10, 6))
sns.lineplot(data=trend_df, x='year', y='n_businesses', hue='Agglomeration Type', marker='o')
plt.title("Agglomeration Type Clusters Total Businesses Over Time")
plt.ylabel("Number of Businesses")
plt.xlabel("Year")
plt.grid(True)
plt.show()

plt.figure(figsize=(10, 6))
sns.lineplot(data=trend_df, x='year', y='Cluster Proportions', hue='Agglomeration Type', marker='o')
plt.title("Agglomeration Type Clusters Proportion Over Time")
plt.ylabel("Number of Clusters")
plt.xlabel("Year")
plt.grid(True)
plt.show()

plt.figure(figsize=(10, 6))
sns.lineplot(data=trend_df, x='year', y='Businesses Proportion', hue='Agglomeration Type', marker='o')
plt.title("Agglomeration Type Clusters Businesses Proportion Over Time")
# NOTE(review): this y-label says "Number of Clusters" for a businesses
# proportion plot -- likely a copy-paste slip; confirm intended label.
plt.ylabel("Number of Clusters")
plt.xlabel("Year")
plt.grid(True)
plt.show()
################
# (Removed dead code: a stray `centroid = Point(group['x'].mean(), ...)`
# line that depended on the leaked loop variable `group` from an earlier
# loop and whose result was never used.)

# near_station returns (within_buffer, min_distance, {station: distance});
# keep the minimum distance per cluster centroid, discard the boolean.
agg_df['_'], agg_df['Min Distance'], agg_df['Distances'] = zip(*agg_df.apply(lambda x:
                                         near_station(Point(x['x'], x['y']), stations_df), axis=1))

plt.figure(figsize=(8, 6))
sns.boxplot(data=agg_df, x='Agglomeration Type', y='Min Distance')
plt.title("Distance to NLE Stations by Agglomeration Type")
plt.ylabel("Distance (meters)")
plt.xlabel("Agglomeration Type")
plt.show()

#######################business density plots####################
# Re-run of the per-station density plots, this time explicitly projected to
# British National Grid (EPSG:27700) so buffers and areas are in metres.
epsg=27700
buffers = [500, 750]
for buffer in buffers:
    buffer_stations_df = stations_df.copy()
    buffer_stations_df = buffer_stations_df.to_crs(epsg)
    buffer_stations_df['geometry'] = buffer_stations_df.buffer(buffer)
    buffer_stations_df['buffer_area_km2'] = buffer_stations_df.geometry.area / 1e6
    # Businesses within each station buffer; 'index_right' is the station row.
    joined = gpd.sjoin(final_gdf_clustered.to_crs(epsg), buffer_stations_df, how='inner', predicate='within')
    business_counts = joined.groupby(['index_right', 'year']).size().reset_index(name='business_count')
    
    stations_with_density = buffer_stations_df.join(
        business_counts.set_index('index_right')[['business_count', 'year']]).fillna(0).rename(columns = {
        'year' : 'Year', 'name' : 'Station Name'})
    
    # Businesses per square kilometre of buffer area.
    stations_with_density['Business Density'] = stations_with_density['business_count'] / stations_with_density['buffer_area_km2']
    plt.figure(figsize=(10, 6))
    sns.lineplot(data=stations_with_density, x='Year', y='Business Density', palette='Set2',
                 hue = 'Station Name', linewidth=2.5, marker='o')
    plt.title(f"Business Density within Stations {buffer}m Buffer")
    plt.show()

# Same EPSG:27700 density plots, additionally split by sector ('Other' excluded).
for buffer in buffers:
    buffer_stations_df = stations_df.copy()
    buffer_stations_df = buffer_stations_df.to_crs(epsg)
    buffer_stations_df['geometry'] = buffer_stations_df.buffer(buffer)
    buffer_stations_df['buffer_area_km2'] = buffer_stations_df.geometry.area / 1e6
    joined = gpd.sjoin(final_gdf_clustered.to_crs(epsg), buffer_stations_df, how='inner', predicate='within')
    joined = joined[joined['scat_category'] != 'Other']
    business_counts = joined.groupby(['index_right', 'year', 'scat_category']).size().reset_index(name='business_count')
    
    stations_with_density = buffer_stations_df.join(
        business_counts.set_index('index_right')[['business_count', 'scat_category', 'year']]).fillna(0).rename(columns = {
        'year' : 'Year', 'name' : 'Station Name', 'scat_category' : 'Sector'})
    # Combined legend key: one line per station/sector pair.
    stations_with_density['Station Name Sector'] = stations_with_density.apply(lambda x: ' - '.join(
        [x['Station Name'], x['Sector']]), axis = 1)
    
    stations_with_density['Business Density'] = stations_with_density['business_count'] / stations_with_density['buffer_area_km2']
    plt.figure(figsize=(10, 6))
    sns.lineplot(data=stations_with_density, x='Year', y='Business Density', palette='Set2',
                 hue = 'Station Name Sector', linewidth=2.5, marker='o')
    plt.title(f"Business Density within Stations {buffer}m Buffer")
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[27], line 363
    361 ent_thresh = agg_df['entropy_norm'].quantile(0.7)
    362 gini_thresh = agg_df['gini_norm'].quantile(0.3)
--> 363 agg_df['Agglomeration Type'] = agg_df.apply(lambda x: classify_row(x, ent_thresh=gini_thresh, gini_thresh=ent_thresh, entrop_col = 'entropy_norm',
    364                                             gini_col = 'gini_norm'), axis=1)
    366 trend_df = agg_df.groupby(['year', 'Agglomeration Type']).size().reset_index(name='count').set_index(['year', 'Agglomeration Type'
    367      ]).join(agg_df[['year', 'Agglomeration Type', 'n_businesses']].groupby(['year', 'Agglomeration Type']).sum()[['n_businesses']]
    368             ).reset_index()
    369 trend_df = agg_df.groupby(['year']).size().reset_index(name='count').set_index(['year']).join(
    370     agg_df[['year', 'Agglomeration Type', 'n_businesses']].groupby(['year']).sum()[['n_businesses']]
    371     ).rename(columns = {'count' : 'Total Count', 'n_businesses': 'Total Businesses'}).join(
    372     trend_df.set_index('year')).reset_index()

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\pandas\core\frame.py:10034, in DataFrame.apply(self, func, axis, raw, result_type, args, by_row, **kwargs)
  10022 from pandas.core.apply import frame_apply
  10024 op = frame_apply(
  10025     self,
  10026     func=func,
   (...)
  10032     kwargs=kwargs,
  10033 )
> 10034 return op.apply().__finalize__(self, method="apply")

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\pandas\core\apply.py:837, in FrameApply.apply(self)
    834 elif self.raw:
    835     return self.apply_raw()
--> 837 return self.apply_standard()

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\pandas\core\apply.py:965, in FrameApply.apply_standard(self)
    964 def apply_standard(self):
--> 965     results, res_index = self.apply_series_generator()
    967     # wrap results
    968     return self.wrap_results(results, res_index)

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\pandas\core\apply.py:981, in FrameApply.apply_series_generator(self)
    978 with option_context("mode.chained_assignment", None):
    979     for i, v in enumerate(series_gen):
    980         # ignore SettingWithCopy here in case the user mutates
--> 981         results[i] = self.func(v, *self.args, **self.kwargs)
    982         if isinstance(results[i], ABCSeries):
    983             # If we have a view on v, we need to make a copy because
    984             #  series_generator will swap out the underlying data
    985             results[i] = results[i].copy(deep=False)

Cell In[27], line 363, in <lambda>(x)
    361 ent_thresh = agg_df['entropy_norm'].quantile(0.7)
    362 gini_thresh = agg_df['gini_norm'].quantile(0.3)
--> 363 agg_df['Agglomeration Type'] = agg_df.apply(lambda x: classify_row(x, ent_thresh=gini_thresh, gini_thresh=ent_thresh, entrop_col = 'entropy_norm',
    364                                             gini_col = 'gini_norm'), axis=1)
    366 trend_df = agg_df.groupby(['year', 'Agglomeration Type']).size().reset_index(name='count').set_index(['year', 'Agglomeration Type'
    367      ]).join(agg_df[['year', 'Agglomeration Type', 'n_businesses']].groupby(['year', 'Agglomeration Type']).sum()[['n_businesses']]
    368             ).reset_index()
    369 trend_df = agg_df.groupby(['year']).size().reset_index(name='count').set_index(['year']).join(
    370     agg_df[['year', 'Agglomeration Type', 'n_businesses']].groupby(['year']).sum()[['n_businesses']]
    371     ).rename(columns = {'count' : 'Total Count', 'n_businesses': 'Total Businesses'}).join(
    372     trend_df.set_index('year')).reset_index()

TypeError: classify_row() got an unexpected keyword argument 'gini_thresh'

Plot Agglomeration Types Over Time¶

In [175]:
# ---------------------------------------------------------------------------
# Per-(year, cluster) diversity metrics, agglomeration typing, and trend plots.
# Depends on earlier cells for: final_gdf_clustered, compute_entropy,
# compute_gini, classify_row, near_station, stations_df.
# ---------------------------------------------------------------------------
agg_results = []

for year in sorted(final_gdf_clustered['year'].unique()):
    year_df = final_gdf_clustered[final_gdf_clustered['year'] == year]
    for cluster_id, group in year_df.groupby('cluster'):
        if cluster_id == -1:
            continue  # skip noise points (cluster label -1)
        # Sector mix of the cluster, ignoring the catch-all 'Other' category.
        counts = group[group['scat_category'] != 'Other']['scat_category'].value_counts()
        agg_results.append({
            'year': year,
            'cluster': cluster_id,
            'entropy': compute_entropy(counts),
            'gini': compute_gini(counts),
            'n_businesses': len(group),
            'x': group['x'].mean(),   # cluster centroid x (lon)
            'y': group['y'].mean()    # cluster centroid y (lat)
        })

agg_df = pd.DataFrame(agg_results)

# Min-max normalize entropy AND gini to [0, 1] within each year, so the
# quantile-based classification thresholds are comparable across years.
# NOTE(review): produces NaN for a year in which all clusters share one value
# (max == min) — TODO confirm that case cannot occur upstream.
for _metric in ('entropy', 'gini'):
    agg_df[f'{_metric}_norm'] = agg_df.groupby('year')[_metric].transform(
        lambda x: (x - x.min()) / (x.max() - x.min())
    )

######################
# Label each cluster with an agglomeration type based on its normalized
# entropy quantile (see classify_row in the helper-function cell).
agg_df['Agglomeration Type'] = agg_df.apply(
    lambda x: classify_row(x, agg_df, entrop_col='entropy_norm',
                           mode='entropy_quantile'),
    axis=1)

# Per (year, type): number of clusters ('count') and total businesses.
trend_df = agg_df.groupby(['year', 'Agglomeration Type']).size().reset_index(name='count').set_index(['year', 'Agglomeration Type'
     ]).join(agg_df[['year', 'Agglomeration Type', 'n_businesses']].groupby(['year', 'Agglomeration Type']).sum()[['n_businesses']]
            ).reset_index()
# Attach per-year totals so proportions can be computed below.
trend_df = agg_df.groupby(['year']).size().reset_index(name='count').set_index(['year']).join(
    agg_df[['year', 'Agglomeration Type', 'n_businesses']].groupby(['year']).sum()[['n_businesses']]
    ).rename(columns = {'count' : 'Total Count', 'n_businesses': 'Total Businesses'}).join(
    trend_df.set_index('year')).reset_index()

trend_df['Cluster Proportions'] = trend_df['count'] / trend_df['Total Count']
trend_df['Businesses Proportion'] = trend_df['n_businesses'] / trend_df['Total Businesses']

# Trend of cluster counts per agglomeration type.
plt.figure(figsize=(10, 6))
sns.lineplot(data=trend_df, x='year', y='count', hue='Agglomeration Type', marker='o')
plt.title("Agglomeration Type Counts Over Time")
plt.ylabel("Number of Clusters")
plt.xlabel("Year")
plt.grid(True)
plt.show()

# Trend of total businesses per agglomeration type.
plt.figure(figsize=(10, 6))
sns.lineplot(data=trend_df, x='year', y='n_businesses', hue='Agglomeration Type', marker='o')
plt.title("Agglomeration Type Clusters Total Businesses Over Time")
plt.ylabel("Number of Businesses")
plt.xlabel("Year")
plt.grid(True)
plt.show()

# Distance from each cluster centroid to the stations.
# near_station returns (within_buffer, min_distance_m, all_distances); the
# boolean is not needed here, so it lands in a throwaway '_' column.
agg_df['_'], agg_df['Min Distance'], agg_df['Distances'] = zip(*agg_df.apply(lambda x:
                                         near_station(Point(x['x'], x['y']), stations_df), axis=1))

plt.figure(figsize=(8, 6))
sns.boxplot(data=agg_df, x='Agglomeration Type', y='Min Distance')
plt.title("Distance to NLE Stations by Agglomeration Type")
plt.ylabel("Distance (meters)")
plt.xlabel("Agglomeration Type")
plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [179]:
import matplotlib.patches as mpatches

# One map per year: businesses in non-noise clusters, colored by a combined
# "Agglomeration Type Sector" label, over a CartoDB basemap with station
# locations and their buffer rings.
# Depends on earlier cells for: years, final_gdf_clustered, agg_df,
# dominant_sectors, cmap (and possibly norm), stations_buffer1/2,
# stations_df_tmp.
for year in years:
    fig, ax = plt.subplots(figsize=(8, 5))
    # Businesses for this year, excluding noise points (cluster == -1).
    tmp_df = final_gdf_clustered[(final_gdf_clustered['year'] == year) & (final_gdf_clustered['cluster'] != -1)].copy()
    # Per-cluster diversity metrics and type labels for this year.
    tmp_agg_df = agg_df[agg_df['year'] == year][['cluster', 'entropy', 'entropy_norm', 'Agglomeration Type']].drop_duplicates().copy()
    # Dominant sector per cluster; renamed to avoid clashing with the
    # per-business 'scat_category' column on join.
    tmp_dominant_sectors = dominant_sectors[dominant_sectors['year'] == year].rename(columns = {'scat_category' : 'dom_scat_category'}
                                                                                    ).drop(columns = ['year', 'proportion'])
    # Attach cluster-level attributes to each business row via the cluster id.
    tmp_df = tmp_df.set_index(['cluster']).join(tmp_agg_df.set_index('cluster')).join(tmp_dominant_sectors.set_index('cluster')).reset_index()
    tmp_df = tmp_df[(tmp_df['scat_category'] != 'Other')]
    # Diversity-type clusters keep the bare type label; otherwise append the
    # business's sector, e.g. "Specialized - Retail".
    tmp_df['Agglomeration Type Sector'] = tmp_df.apply(lambda x: x['Agglomeration Type'] if x['Agglomeration Type'] in ['High diversity', 'Moderate diversity']
                                                       else ' - '.join([x['Agglomeration Type'], x['scat_category']]), axis = 1)

    # Sort so the categorical colormap assigns colors in a stable order.
    tmp_df = tmp_df.sort_values(by = 'Agglomeration Type Sector')
    # Reproject to Web Mercator so the contextily basemap tiles line up.
    if tmp_df.crs != 'EPSG:3857':
        tmp_df = tmp_df.to_crs(epsg=3857)

    sc = tmp_df.plot(ax=ax, column='Agglomeration Type Sector', cmap=cmap, markersize=5, legend=False, zorder=3)

    # Station buffer rings (outlines only) under the points, stations on top.
    stations_buffer1.plot(ax=ax, color='none', edgecolor='black', linestyle='--', linewidth=1.2, alpha=0.6, zorder=2)
    stations_buffer2.plot(ax=ax, color='none', edgecolor='black', linestyle='--', linewidth=1.2, alpha=0.6, zorder=2)
    stations_df_tmp.plot(ax=ax, color='black', marker='o', markersize=50, label='Station', zorder=4)
    ctx.add_basemap(ax, source=ctx.providers.CartoDB.PositronNoLabels, crs=tmp_df.crs)
    ax.set_title(f'Business Clusters by Agglomeration Type – Year: {year}', fontsize=14)
    ax.set_axis_off()
    # Build the legend manually because geopandas' categorical legend was
    # disabled above (legend=False).
    categories = tmp_df['Agglomeration Type Sector'].unique()
    # NOTE(review): 'norm' is presumably a category->value mapping from an
    # earlier cell — confirm; the `'norm' in locals()` guard is fragile and the
    # fallback divides by zero when only one category exists. Also, manually
    # computed colors may not match the colors geopandas assigned — verify.
    colors = [cmap(norm(cat)) if 'norm' in locals() else cmap(i / (len(categories)-1)) for i, cat in enumerate(categories)]
    patches = [mpatches.Patch(color=col, label=cat) for col, cat in zip(colors, categories)]
    ax.legend(handles=patches, title='Agglomeration Type', loc='upper left', fontsize=9, title_fontsize=10)

    plt.tight_layout()
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image